{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "45211\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>job</th>\n",
       "      <th>marital</th>\n",
       "      <th>education</th>\n",
       "      <th>default</th>\n",
       "      <th>balance</th>\n",
       "      <th>housing</th>\n",
       "      <th>loan</th>\n",
       "      <th>contact</th>\n",
       "      <th>day</th>\n",
       "      <th>month</th>\n",
       "      <th>duration</th>\n",
       "      <th>campaign</th>\n",
       "      <th>pdays</th>\n",
       "      <th>previous</th>\n",
       "      <th>poutcome</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>58</td>\n",
       "      <td>management</td>\n",
       "      <td>married</td>\n",
       "      <td>tertiary</td>\n",
       "      <td>no</td>\n",
       "      <td>2143</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>261</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>44</td>\n",
       "      <td>technician</td>\n",
       "      <td>single</td>\n",
       "      <td>secondary</td>\n",
       "      <td>no</td>\n",
       "      <td>29</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>151</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>33</td>\n",
       "      <td>entrepreneur</td>\n",
       "      <td>married</td>\n",
       "      <td>secondary</td>\n",
       "      <td>no</td>\n",
       "      <td>2</td>\n",
       "      <td>yes</td>\n",
       "      <td>yes</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>76</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>47</td>\n",
       "      <td>blue-collar</td>\n",
       "      <td>married</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "      <td>1506</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>92</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>33</td>\n",
       "      <td>unknown</td>\n",
       "      <td>single</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "      <td>1</td>\n",
       "      <td>no</td>\n",
       "      <td>no</td>\n",
       "      <td>unknown</td>\n",
       "      <td>5</td>\n",
       "      <td>may</td>\n",
       "      <td>198</td>\n",
       "      <td>1</td>\n",
       "      <td>-1</td>\n",
       "      <td>0</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   age           job  marital  education default  balance housing loan  \\\n",
       "0   58    management  married   tertiary      no     2143     yes   no   \n",
       "1   44    technician   single  secondary      no       29     yes   no   \n",
       "2   33  entrepreneur  married  secondary      no        2     yes  yes   \n",
       "3   47   blue-collar  married    unknown      no     1506     yes   no   \n",
       "4   33       unknown   single    unknown      no        1      no   no   \n",
       "\n",
       "   contact  day month  duration  campaign  pdays  previous poutcome   y  \n",
       "0  unknown    5   may       261         1     -1         0  unknown  no  \n",
       "1  unknown    5   may       151         1     -1         0  unknown  no  \n",
       "2  unknown    5   may        76         1     -1         0  unknown  no  \n",
       "3  unknown    5   may        92         1     -1         0  unknown  no  \n",
       "4  unknown    5   may       198         1     -1         0  unknown  no  "
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# df = pd.read_csv(\"../Data/bank.csv\", sep=\";\")\n",
    "df = pd.read_csv(\"../Data/bank-full.csv\", sep=\";\")\n",
    "print(len(df))\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style>\n",
       "    .dataframe thead tr:only-child th {\n",
       "        text-align: right;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: left;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>age</th>\n",
       "      <th>job</th>\n",
       "      <th>marital</th>\n",
       "      <th>education</th>\n",
       "      <th>default</th>\n",
       "      <th>balance</th>\n",
       "      <th>housing</th>\n",
       "      <th>loan</th>\n",
       "      <th>contact</th>\n",
       "      <th>day</th>\n",
       "      <th>month</th>\n",
       "      <th>duration</th>\n",
       "      <th>campaign</th>\n",
       "      <th>pdays</th>\n",
       "      <th>previous</th>\n",
       "      <th>poutcome</th>\n",
       "      <th>y</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>count</th>\n",
       "      <td>45211.000000</td>\n",
       "      <td>45211</td>\n",
       "      <td>45211</td>\n",
       "      <td>45211</td>\n",
       "      <td>45211</td>\n",
       "      <td>45211.000000</td>\n",
       "      <td>45211</td>\n",
       "      <td>45211</td>\n",
       "      <td>45211</td>\n",
       "      <td>45211.000000</td>\n",
       "      <td>45211</td>\n",
       "      <td>45211.000000</td>\n",
       "      <td>45211.000000</td>\n",
       "      <td>45211.000000</td>\n",
       "      <td>45211.000000</td>\n",
       "      <td>45211</td>\n",
       "      <td>45211</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>unique</th>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>3</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>3</td>\n",
       "      <td>NaN</td>\n",
       "      <td>12</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>top</th>\n",
       "      <td>NaN</td>\n",
       "      <td>blue-collar</td>\n",
       "      <td>married</td>\n",
       "      <td>secondary</td>\n",
       "      <td>no</td>\n",
       "      <td>NaN</td>\n",
       "      <td>yes</td>\n",
       "      <td>no</td>\n",
       "      <td>cellular</td>\n",
       "      <td>NaN</td>\n",
       "      <td>may</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>unknown</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>freq</th>\n",
       "      <td>NaN</td>\n",
       "      <td>9732</td>\n",
       "      <td>27214</td>\n",
       "      <td>23202</td>\n",
       "      <td>44396</td>\n",
       "      <td>NaN</td>\n",
       "      <td>25130</td>\n",
       "      <td>37967</td>\n",
       "      <td>29285</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13766</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>36959</td>\n",
       "      <td>39922</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mean</th>\n",
       "      <td>40.936210</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1362.272058</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>15.806419</td>\n",
       "      <td>NaN</td>\n",
       "      <td>258.163080</td>\n",
       "      <td>2.763841</td>\n",
       "      <td>40.197828</td>\n",
       "      <td>0.580323</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>std</th>\n",
       "      <td>10.618762</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3044.765829</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.322476</td>\n",
       "      <td>NaN</td>\n",
       "      <td>257.527812</td>\n",
       "      <td>3.098021</td>\n",
       "      <td>100.128746</td>\n",
       "      <td>2.303441</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>min</th>\n",
       "      <td>18.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>-8019.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25%</th>\n",
       "      <td>33.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>72.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>8.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>103.000000</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50%</th>\n",
       "      <td>39.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>448.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>16.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>180.000000</td>\n",
       "      <td>2.000000</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>75%</th>\n",
       "      <td>48.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1428.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>21.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>319.000000</td>\n",
       "      <td>3.000000</td>\n",
       "      <td>-1.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>max</th>\n",
       "      <td>95.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>102127.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>31.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>4918.000000</td>\n",
       "      <td>63.000000</td>\n",
       "      <td>871.000000</td>\n",
       "      <td>275.000000</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 age          job  marital  education default        balance  \\\n",
       "count   45211.000000        45211    45211      45211   45211   45211.000000   \n",
       "unique           NaN           12        3          4       2            NaN   \n",
       "top              NaN  blue-collar  married  secondary      no            NaN   \n",
       "freq             NaN         9732    27214      23202   44396            NaN   \n",
       "mean       40.936210          NaN      NaN        NaN     NaN    1362.272058   \n",
       "std        10.618762          NaN      NaN        NaN     NaN    3044.765829   \n",
       "min        18.000000          NaN      NaN        NaN     NaN   -8019.000000   \n",
       "25%        33.000000          NaN      NaN        NaN     NaN      72.000000   \n",
       "50%        39.000000          NaN      NaN        NaN     NaN     448.000000   \n",
       "75%        48.000000          NaN      NaN        NaN     NaN    1428.000000   \n",
       "max        95.000000          NaN      NaN        NaN     NaN  102127.000000   \n",
       "\n",
       "       housing   loan   contact           day  month      duration  \\\n",
       "count    45211  45211     45211  45211.000000  45211  45211.000000   \n",
       "unique       2      2         3           NaN     12           NaN   \n",
       "top        yes     no  cellular           NaN    may           NaN   \n",
       "freq     25130  37967     29285           NaN  13766           NaN   \n",
       "mean       NaN    NaN       NaN     15.806419    NaN    258.163080   \n",
       "std        NaN    NaN       NaN      8.322476    NaN    257.527812   \n",
       "min        NaN    NaN       NaN      1.000000    NaN      0.000000   \n",
       "25%        NaN    NaN       NaN      8.000000    NaN    103.000000   \n",
       "50%        NaN    NaN       NaN     16.000000    NaN    180.000000   \n",
       "75%        NaN    NaN       NaN     21.000000    NaN    319.000000   \n",
       "max        NaN    NaN       NaN     31.000000    NaN   4918.000000   \n",
       "\n",
       "            campaign         pdays      previous poutcome      y  \n",
       "count   45211.000000  45211.000000  45211.000000    45211  45211  \n",
       "unique           NaN           NaN           NaN        4      2  \n",
       "top              NaN           NaN           NaN  unknown     no  \n",
       "freq             NaN           NaN           NaN    36959  39922  \n",
       "mean        2.763841     40.197828      0.580323      NaN    NaN  \n",
       "std         3.098021    100.128746      2.303441      NaN    NaN  \n",
       "min         1.000000     -1.000000      0.000000      NaN    NaN  \n",
       "25%         1.000000     -1.000000      0.000000      NaN    NaN  \n",
       "50%         2.000000     -1.000000      0.000000      NaN    NaN  \n",
       "75%         3.000000     -1.000000      0.000000      NaN    NaN  \n",
       "max        63.000000    871.000000    275.000000      NaN    NaN  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.describe(include=\"all\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Check for nulls"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "for col in df.columns:\n",
    "    if df[col].isnull().sum() !=0:\n",
    "        print(col, df[col].isnull().sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Find Categorical Variables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome', 'y']\n"
     ]
    }
   ],
   "source": [
    "cat_vars = []\n",
    "for col in df.columns:\n",
    "    if df[col].dtype != \"int64\":\n",
    "        cat_vars.append(col)\n",
    "#         print(col)#, df[col].dtype)\n",
    "print(cat_vars)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "blue-collar      9732\n",
      "management       9458\n",
      "technician       7597\n",
      "admin.           5171\n",
      "services         4154\n",
      "retired          2264\n",
      "self-employed    1579\n",
      "entrepreneur     1487\n",
      "unemployed       1303\n",
      "housemaid        1240\n",
      "student           938\n",
      "unknown           288\n",
      "Name: job, dtype: int64\n",
      "married     27214\n",
      "single      12790\n",
      "divorced     5207\n",
      "Name: marital, dtype: int64\n",
      "secondary    23202\n",
      "tertiary     13301\n",
      "primary       6851\n",
      "unknown       1857\n",
      "Name: education, dtype: int64\n",
      "no     44396\n",
      "yes      815\n",
      "Name: default, dtype: int64\n",
      "yes    25130\n",
      "no     20081\n",
      "Name: housing, dtype: int64\n",
      "no     37967\n",
      "yes     7244\n",
      "Name: loan, dtype: int64\n",
      "cellular     29285\n",
      "unknown      13020\n",
      "telephone     2906\n",
      "Name: contact, dtype: int64\n",
      "may    13766\n",
      "jul     6895\n",
      "aug     6247\n",
      "jun     5341\n",
      "nov     3970\n",
      "apr     2932\n",
      "feb     2649\n",
      "jan     1403\n",
      "oct      738\n",
      "sep      579\n",
      "mar      477\n",
      "dec      214\n",
      "Name: month, dtype: int64\n",
      "unknown    36959\n",
      "failure     4901\n",
      "other       1840\n",
      "success     1511\n",
      "Name: poutcome, dtype: int64\n",
      "no     39922\n",
      "yes     5289\n",
      "Name: y, dtype: int64\n"
     ]
    }
   ],
   "source": [
    "for var in cat_vars:\n",
    "    print(df[var].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8222222222222222\n"
     ]
    }
   ],
   "source": [
    "print(37000/45000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "for var in cat_vars:\n",
    "    if var == \"education\":\n",
    "        cat_vars.remove(var)\n",
    "print(cat_vars)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
